################################################
# Analysis for:
#
# Jordan, Soren, Hannah Paul, and Andrew Q. Philips.
#
# Required files: Eduskunta Speeches 1995-2019.dta
#
################################################

library(readstata13)
library(plm)
library(dplyr)
library(randomForest)
library(caret)
library(foreign)
library(glmnet)
library(e1071)
library(ggplot2)
library(neuralnet)
library(ranger)
library(sandwich)
library(lmtest)
library(zoo)
library(AER)
library(car)
library(carData)
library(survival)
library(Formula)
library(sfsmisc)
library(haven)
library(pdp)
library(ICEbox) # this package makes us lump all our X vars into an object
library(rdd)
library(ALEPlot)
library(scales)
library(MASS)
library(ggpubr)

data.pr <- read.dta13("/Users/scj0014/Myfiles/Dropbox/Interpreting ML Models/LSQ research note/Example 3/Eduskunta Speeches 1995-2019.dta")

head(data.pr)

######################################################
# Explicit dummy versions of terms
######################################################

data.pr$term2 <- ifelse(data.pr$term == 2, 1, 0)
data.pr$term2_fac <- factor(data.pr$term2)
table(data.pr$term2_fac)
is.factor(data.pr$term2_fac)

data.pr$term3 <- ifelse(data.pr$term == 3, 1, 0)
data.pr$term3_fac <- factor(data.pr$term3)
table(data.pr$term3_fac)
is.factor(data.pr$term3_fac)

data.pr$term4 <- ifelse(data.pr$term == 4, 1, 0)
data.pr$term4_fac <- factor(data.pr$term4)
table(data.pr$term4_fac)
is.factor(data.pr$term4_fac)

data.pr$term5 <- ifelse(data.pr$term == 5, 1, 0)
data.pr$term5_fac <- factor(data.pr$term5)
table(data.pr$term5_fac)
is.factor(data.pr$term5_fac)

data.pr$term6 <- ifelse(data.pr$term == 6, 1, 0)
data.pr$term6_fac <- factor(data.pr$term6)
table(data.pr$term6_fac)
is.factor(data.pr$term6_fac)

data.pr$term7 <- ifelse(data.pr$term == 7, 1, 0)
data.pr$term7_fac <- factor(data.pr$term7)
table(data.pr$term7_fac)
is.factor(data.pr$term7_fac)

######################################################
# Explicit dummy versions of party family
######################################################

data.pr$pf1 <- ifelse(data.pr$party_family == 1, 1, 0)
data.pr$pf1_fac <- factor(data.pr$pf1)
table(data.pr$pf1_fac)
is.factor(data.pr$pf1_fac)

# There are no 2s
# data.pr$pf2 <- ifelse(data.pr$party_family == 2, 1, 0)
# data.pr$pf2_fac <- factor(data.pr$pf2)
# table(data.pr$pf2_fac)
# is.factor(data.pr$pf2_fac)

data.pr$pf3 <- ifelse(data.pr$party_family == 3, 1, 0)
data.pr$pf3_fac <- factor(data.pr$pf3)
table(data.pr$pf3_fac)
is.factor(data.pr$pf3_fac)

data.pr$pf4 <- ifelse(data.pr$party_family == 4, 1, 0)
data.pr$pf4_fac <- factor(data.pr$pf4)
table(data.pr$pf4_fac)
is.factor(data.pr$pf4_fac)

data.pr$pf5 <- ifelse(data.pr$party_family == 5, 1, 0)
data.pr$pf5_fac <- factor(data.pr$pf5)
table(data.pr$pf5_fac)
is.factor(data.pr$pf5_fac)

data.pr$pf6 <- ifelse(data.pr$party_family == 6, 1, 0)
data.pr$pf6_fac <- factor(data.pr$pf6)
table(data.pr$pf6_fac)
is.factor(data.pr$pf6_fac)

data.pr$pf7 <- ifelse(data.pr$party_family == 7, 1, 0)
data.pr$pf7_fac <- factor(data.pr$pf7)
table(data.pr$pf7_fac)
is.factor(data.pr$pf7_fac)

data.pr$pf8 <- ifelse(data.pr$party_family == 8, 1, 0)
data.pr$pf8_fac <- factor(data.pr$pf8)
table(data.pr$pf8_fac)
is.factor(data.pr$pf8_fac)

data.pr$pf9 <- ifelse(data.pr$party_family == 9, 1, 0)
data.pr$pf9_fac <- factor(data.pr$pf9)
table(data.pr$pf9_fac)
is.factor(data.pr$pf9_fac)

######################################################
# Explicit dummy versiosn of everything else
######################################################

data.pr$oppmaj_fac <- factor(data.pr$oppmaj)
table(data.pr$oppmaj_fac)
is.factor(data.pr$oppmaj_fac)

data.pr$gender_fac <- factor(data.pr$gender)
table(data.pr$gender_fac)
is.factor(data.pr$gender_fac)

# Original data is wrong. Groupleader has a 2 when it should be
data.pr$groupleader[data.pr$groupleader == 2] <- 1
data.pr$groupleader_fac <- factor(data.pr$groupleader)
table(data.pr$groupleader_fac)
is.factor(data.pr$groupleader_fac)

data.pr$partyleader_fac <- factor(data.pr$partyleader)
table(data.pr$partyleader_fac)
is.factor(data.pr$partyleader_fac)

data.pr$committee_fac <- factor(data.pr$committee)
table(data.pr$committee_fac)
is.factor(data.pr$committee_fac)

data.pr$minister2_fac <- factor(data.pr$minister2)
table(data.pr$minister2_fac)
is.factor(data.pr$minister2_fac)

data.pr$oppmaj_fac <- factor(data.pr$oppmaj)
table(data.pr$oppmaj_fac)
is.factor(data.pr$oppmaj_fac)

# Create interaction in model
data.pr$intravulpartyscore <- data.pr$intravul*data.pr$partyscore1


######################################################
# Strict replication of original model. 
#  (H3, model (3) in the supplemental appendix)
######################################################

model.replicate <- glm.nb(Speeches ~ gender + intravul + intervul + partyscore1 + intravulpartyscore + 
	groupleader + partyleader + committee + minister2 + oppmaj + size + seniority_term +
	exposure + term3 + term4 + term5 + term6 + term7 + pf3 + pf4 + pf5 + pf6 +
	pf7 + pf8 + pf9, data = data.pr)

summary(model.replicate)

cbind(round(coef(model.replicate), digits = 3), "&", 
	paste0("(", round(sqrt(diag(vcov(model.replicate))), digits = 3), ")"), "\\")

# Same thing with the factors, to pull model terms below
model.replicate.fac <- glm.nb(Speeches ~ gender_fac + intravul + intervul + 
	partyscore1 + intravulpartyscore + groupleader_fac + partyleader_fac + 
	committee_fac + minister2_fac + oppmaj_fac + size + seniority_term +
	exposure + term3_fac + term4_fac + term5_fac + term6_fac + term7_fac + 
	pf3_fac + pf4_fac + pf5_fac + pf6_fac + pf7_fac + pf8_fac + pf9_fac, data = data.pr)

summary(model.replicate.fac)

# Stripping the names, are they the same?
identical(cbind(coef(model.replicate), coef(model.replicate.fac))[,1], 
				cbind(coef(model.replicate), coef(model.replicate.fac))[,2])

###############################################################################
############################# Random forest models ############################
###############################################################################
#########################################################
######################## Estimate #######################
#########################################################
####################################################
# Estimate (no interactions, non-linear ML model!) #
####################################################

# Take the coefficients from the factor model, exluding the interactions
pr.rf.X.nointaxn <- attr(model.replicate.fac$terms, which = "term.labels")[!(attr(model.replicate.fac$terms, which = "term.labels") %in% 
						c("intravulpartyscore"))]

# Frame the data, including the Speeches variable
pr.rf.data.nointaxn <- data.pr[names(data.pr) %in% c(pr.rf.X.nointaxn, "Speeches")]

# Remove missing observations
pr.rf.data.nona.nointaxn <- na.omit(pr.rf.data.nointaxn)

# Set hyperparameters using ranger. You MUST create this.
makeHyper <- function(min.mtry, max.mtry, mtry.increment, 
					min.node, max.node, node.increment,
					min.tree, max.tree, tree.increment) {
	combinations <- expand.grid(mtry = seq(min.mtry, max.mtry, mtry.increment),
													node.size = seq(min.node, max.node, node.increment),
													tree.size = seq(min.tree, max.tree, tree.increment),
													OOB.RMSE = NA)
	combinations$comb <- 1:dim(combinations)[1]
	combinations
}

hyperparameter.grid.PR <- makeHyper(min.mtry = 2, max.mtry = 12, mtry.increment = 2,
									min.node = 2, max.node = 11, node.increment = 2,
									min.tree = 200, max.tree = 2000, tree.increment = 100)

# Choose one: if you're running a new model, you'd want to run this. It's searching over the parameter
#  combinations and calculating the RMSE for each. We'll end up using the minimum RMSE
#  So normally, we'd 
# [1] Run the grid search, which might take a while, and pull the parameters from there.
#
# set.seed(27)
# for(i in 1:nrow(hyperparameter.grid.PR)) {
#	the.model <- ranger(formula = Speeches  ~ ., 
#						data = pr.rf.data.nona.nointaxn,
#						num.trees = hyperparameter.grid.PR$tree.size[i],
#						mtry = hyperparameter.grid.PR$mtry[i],
#						min.node.size = hyperparameter.grid.PR$node.size[i])
#	hyperparameter.grid.PR$OOB.RMSE[i] <- sqrt(the.model$prediction.error)
# }
#
# And then set the combination with the minimum RMSE.
# hyperparameter.grid.PR[which(hyperparameter.grid.PR$OOB.RMSE == min(hyperparameter.grid.PR$OOB.RMSE)),]
#  mtry node.size tree.size OOB.RMSE comb
# 8    4         4       200 170.8642    8
#
# Assign it as the combination to use.
# the.combo <- hyperparameter.grid.PR[which(hyperparameter.grid.PR$OOB.RMSE == min(hyperparameter.grid.PR$OOB.RMSE)), "comb"]
#
# Since this takes a long time, though, we're going to use the previous results.
# [2] Rely on previous results to pull the relevant combination.
 the.combo <- 8


set.seed(120)
# Random forest
full.rf.pr.nointaxn <- randomForest(Speeches ~ ., 
							data = pr.rf.data.nona.nointaxn, 
							ntree = hyperparameter.grid.PR[hyperparameter.grid.PR$comb == the.combo,]$tree.size,
							mtry = hyperparameter.grid.PR[hyperparameter.grid.PR$comb == the.combo,]$mtry,
							nodesize = hyperparameter.grid.PR[hyperparameter.grid.PR$comb == the.combo,]$node.size,
							importance = TRUE)

#########################################################
########################## VIP ##########################
#########################################################
#########################
# VIP (single variable) #
#########################
# How to make a VIP.
# [1] Save the variable names and importance measures from the ML model
#     We will call this saved data ``ho.X.vars.nointaxn.vip.dat''
pr.X.vars.nointaxn.vip.dat <- data.frame(names = rownames(full.rf.pr.nointaxn$importance), 
								MDA = full.rf.pr.nointaxn$importance[,1]/full.rf.pr.nointaxn$importanceSD)

# [2] Add the mean decrease in accuracy measure to ``pr.X.vars.nointaxn.vip.dat'' as explicitly numeric
pr.X.vars.nointaxn.vip.dat$MDA <- as.numeric(pr.X.vars.nointaxn.vip.dat$MDA.MeanDecreaseAccuracy)

# [3] What do we want to plot and draw attention to?
#     [a] All variables. We're going to change the variable names for prettier plotting
pr.X.vars.nointaxn.vip.dat$Variable <- c("Size of Party", "Intra-party Vulnerability", 
						"Inter-party Vulnerability", "Party Score", "Seniority", "Exposure",
						paste0("Term", 3:7), paste0("Party", 3:9), "Opposition Major", "Female",
						"Group Leader", "Party Leader", "Committee Chair", "Minister")

#     [b] Ignore the fixed effects. We can also force ggplot to ignore the factor variables in plotting by passing a 
#         blank label where term and party are
pr.X.vars.nointaxn.vip.dat$Variable2 <- c("Size of Party", "Intra-party Vulnerability", 
						"Inter-party Vulnerability", "Party Score", "Seniority", "Exposure",
						rep(" ", length(3:7)), rep(" ", length(3:9)), "Opposition Major", "Female",
						"Group Leader", "Party Leader", "Committee Chair", "Minister")

#     [c] Color the key variables. Indicate, by variable name, which variables to change colors
pr.X.vars.nointaxn.vip.dat$pr.key.var <- "no"
pr.X.vars.nointaxn.vip.dat$pr.key.var[pr.X.vars.nointaxn.vip.dat$Variable == "Intra-party Vulnerability"] <- "yes"
pr.X.vars.nointaxn.vip.dat$pr.key.var[pr.X.vars.nointaxn.vip.dat$Variable == "Inter-party Vulnerability"] <- "yes"
pr.X.vars.nointaxn.vip.dat$pr.key.var[pr.X.vars.nointaxn.vip.dat$Variable == "Party Score"] <- "yes"
pr.X.vars.nointaxn.vip.dat$pr.key.var[pr.X.vars.nointaxn.vip.dat$Variable == "Opposition Major"] <- "yes"



# [4] Make an ordered version of importance, sorted by the value of the VIP metric
pr.X.vars.nointaxn.vip.dat.sort <- pr.X.vars.nointaxn.vip.dat %>% arrange(-MDA)


# [5] Pass to ggplot as normal. This version incldues the fixed effects and colors by key variable (yes/no in scale_fill_manual)
#     We're passing MDA: the MeanDecreaseAccuracy created above
pr.X.vars.nointaxn.vip.plot <- ggplot(pr.X.vars.nointaxn.vip.dat.sort, aes(x = reorder(Variable, -MDA), 
									y = MDA, fill = pr.key.var)) +
								geom_bar(stat = "identity") + 
								theme_minimal() + 
								theme(axis.text.x = element_text(angle=75, hjust = 1), legend.position = "none") +
								scale_fill_manual(values = c("yes" = "black", "no" = "grey50")) + 
								xlab("") +
								ylab("Mean Decrease Accuracy")

# Figure 8
# pdf("...")
pr.X.vars.nointaxn.vip.plot
 dev.off()



#########################################################
########################## PDP ##########################
#########################################################
#########################
# PDP (single variable) #
#########################

# How to make a PDP. Start with extremity
# [1] Define the prediction function
pr.X.vars.nointaxn.pdppred <- function(object, newdata) {
	predict(full.rf.pr.nointaxn, newdata)
}

# [2] Extract the pdp using the partial() function on the ML model. 
#     Change pred.var to the model variable we're interested in the PDP for
#     The pred.fun argument is the prediction function defined above
the.pdp.intravul <- partial(full.rf.pr.nointaxn, pred.var = "intravul", pred.fun = pr.X.vars.nointaxn.pdppred)

# [3] Pass to ggplot as normal. Notice, we can add a rug by passing the PDP through stat_summary
intravul.pdp <- ggplot() + 
					stat_summary(data = the.pdp.intravul, aes(x = intravul, y = yhat), 
						fun = mean, geom = "line", col = "black", size = 2) +
					theme_minimal() +
					xlab("Intra-party Vulnerability") +
					ylab("Predicted Value") + 
					geom_rug(data = pr.rf.data.nona.nointaxn, aes(x = intravul))

# Figure 9
# pdf("...")
intravul.pdp
 dev.off()



# Repeat for inter-party
the.pdp.intervul <- partial(full.rf.pr.nointaxn, pred.var = "intervul", pred.fun = pr.X.vars.nointaxn.pdppred)

intervul.pdp <- ggplot() + 
					stat_summary(data = the.pdp.intervul, aes(x = intervul, y = yhat), 
						fun = mean, geom = "line", col = "black", size = 2) +
					theme_minimal() +
					xlab("Inter-party Vulnerability") +
					ylab("Predicted Value") + 
					geom_rug(data = pr.rf.data.nona.nointaxn, aes(x = intervul))

# Figure 10
# pdf("...")
intervul.pdp
 dev.off()

 
 
# Repeat for partyscore1
the.pdp.party <- partial(full.rf.pr.nointaxn, pred.var = "partyscore1", pred.fun = pr.X.vars.nointaxn.pdppred)

party.pdp <- ggplot() + 
					stat_summary(data = the.pdp.party, aes(x = partyscore1, y = yhat), 
						fun = mean, geom = "line", col = "black", size = 2) +
					theme_minimal() +
					xlab("Party Score in District") +
					ylab("Predicted Value") + 
					geom_rug(data = pr.rf.data.nona.nointaxn, aes(x = partyscore1))

# Figure 12
# pdf("...")
 party.pdp
dev.off()




# [1] If we want to plot a PDP with an interaction, we just pass multiple variables to partial()
#     Notice, the prediction function is the same
#     Here, the predictors are continuous and catetorical
the.pdp.intravul.party <- partial(full.rf.pr.nointaxn, 
					pred.var = c("intravul", "partyscore1"), 
					pred.fun = pr.X.vars.nointaxn.pdppred, chull = TRUE)


# [2] The partial() object contains fits by value, so we will aggregate them to make a single point prediction
#     This time we will use tidyverse: grouping by the two variables we're making predictions on
the.pdp.intravul.party.group <- the.pdp.intravul.party  %>% group_by(intravul, partyscore1) %>%
						summarise(yhat = mean(yhat))

intravul.party.pdp <- ggplot(the.pdp.intravul.party.group) + 
						geom_tile(aes(x = intravul, y = partyscore1, fill = yhat)) +
						scale_fill_gradientn("Predicted Value", colours=c("grey10", "grey90")) +
						theme_minimal() + 
						xlab("Intra-party Vulnerability") + 
						ylab("Party Score") + 
						theme(legend.position = c(0.15, 0.80), legend.background = element_rect(fill = "white")) 

# Figure 11
# pdf("...")
intravul.party.pdp
 dev.off()



#########################################################
########################## ICE ##########################
#########################################################
#########################
# ICE (single variable) #
#########################

# [1] If we want to plot an ICE, we have to pay some attention to the number of lines. Here, the number
#     of y.hats is
length(unique(the.pdp.party$yhat.id))

# 1228, which we might reasonably fit on a plot

party.ice <- ggplot() + 
					geom_line(data = the.pdp.party, aes(x = partyscore1, y = yhat, group = yhat.id), alpha = 0.2) +
					stat_summary(data = the.pdp.party, aes(x = partyscore1, y = yhat), 
						fun = mean, geom = "line", col = "black", size = 2) +
					theme_minimal() +
					xlab("Party Score in District") +
					ylab("Predicted Value") + 
					geom_rug(data = pr.rf.data.nona.nointaxn, aes(x = partyscore1))

# Figure 13
# pdf("...")
party.ice
 dev.off()

